Install/Import Required Packages
# Install (if missing) and attach every package used in this document.
# CRAN packages are installed with install.packages(); GEOquery is a
# Bioconductor package and MUST be installed with BiocManager::install()
# — install.packages("GEOquery") fails because it is not on CRAN.
suppressMessages({
  cran_packages <- c(
    "knitr", "ggplot2", "rpart", "rpart.plot", "randomForest", "caret",
    "factoextra", "FactoMineR", "pROC", "ggdendro", "mlbench",
    "BiocManager", "pheatmap"
  )
  for (pkg in cran_packages) {
    # requireNamespace() checks availability without attaching the package.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }
  # GEOquery lives on Bioconductor, not CRAN.
  if (!requireNamespace("GEOquery", quietly = TRUE)) {
    BiocManager::install("GEOquery")
  }
  library(knitr)
  library(ggplot2)
  library(rpart)
  library(rpart.plot)
  library(randomForest)
  library(caret)
  library(factoextra)
  library(FactoMineR)
  library(pROC)
  library(ggdendro)
  library(mlbench)
  library(BiocManager)
  library(GEOquery)
  library(pheatmap)
})
We will go through two regression techniques in this session:
**As a simple data set, we will use the “mtcars” data set in
R. The following code can be used to display the data
set.**
library(knitr)
data("mtcars")
kable(head(mtcars, 10),caption = "Motor Trend Car Road Tests")
| | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
| Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
| Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
| Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
| Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
| Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
| Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
| Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 |
| Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
**Let’s Display the data set for Miles per Gallon
(response) and Weight (predictor).**
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter plot of fuel efficiency (mpg) against weight (wt).
mtcars_plot <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  xlab('Weight (1000 lbs)') +
  ylab('Miles Per Gallon')
ggplotly(mtcars_plot)
The R function `lm` is used to fit a linear
regression in R. Here the response is Miles per Gallon (mpg) and
the predictor is Weight (wt).
# Simple linear regression: model mpg as a linear function of weight.
simple_linear_regression <- lm(mpg ~ wt, data = mtcars)
# Coefficient estimates, significance tests, and fit statistics.
summary(simple_linear_regression)
##
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
library(ggplot2)
library(plotly)
# Scatter plot with the fitted least-squares line (red) and dashed
# vertical segments showing each observation's residual.
fitted_mpg <- lm(mpg ~ wt, data = mtcars)$fitted
linear_reg_gg <- ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  geom_segment(aes(xend = wt, yend = fitted_mpg), linetype = "dashed") +
  xlim(0, 5.5) +
  xlab('Weight (1000 lbs)') +
  ylab('Miles Per Gallon')
ggplotly(linear_reg_gg)
## `geom_smooth()` using formula = 'y ~ x'
The following plots are used to check whether the assumptions of the linear regression are satisfied.
# Arrange a 2x2 grid for the four regression diagnostic plots
# (residuals vs fitted, Q-Q, scale-location, Cook's distance).
# NOTE: plotly masks graphics::layout (see the package attach messages
# above), so the namespace must be given explicitly; a bare layout()
# call here would dispatch to plotly::layout and fail.
graphics::layout(matrix(1:4, byrow = TRUE, ncol = 2))
plot(simple_linear_regression, which = 1:4)
Let’s add more input variables (or predictor variables) to the linear regression model.
\[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~ Cylinders + \beta_3 ~Rear\_axle\_Ratio + Error \]
# Multiple linear regression: mpg on weight, cylinders, and rear axle ratio.
multiple_linear_regression <- lm(mpg ~ wt + cyl + drat, data = mtcars)
summary(multiple_linear_regression)
##
## Call:
## lm(formula = mpg ~ wt + cyl + drat, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2944 -1.5576 -0.4667 1.5678 6.1014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.7677 6.8729 5.786 3.26e-06 ***
## wt -3.1947 0.8293 -3.852 0.000624 ***
## cyl -1.5096 0.4464 -3.382 0.002142 **
## drat -0.0162 1.3231 -0.012 0.990317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.613 on 28 degrees of freedom
## Multiple R-squared: 0.8302, Adjusted R-squared: 0.812
## F-statistic: 45.64 on 3 and 28 DF, p-value: 6.569e-11
Since the Rear axle ratio is not significant (p-value
> 0.05), we can remove that variable and run the model again. \[ MPG = \beta_0 + \beta_1 Weight + \beta_2 ~
Cylinders + Error \]
# Reduced model after dropping the non-significant drat predictor.
multiple_linear_regression <- lm(mpg ~ wt + cyl, data = mtcars)
summary(multiple_linear_regression)
##
## Call:
## lm(formula = mpg ~ wt + cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2893 -1.5512 -0.4684 1.5743 6.1004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.6863 1.7150 23.141 < 2e-16 ***
## wt -3.1910 0.7569 -4.216 0.000222 ***
## cyl -1.5078 0.4147 -3.636 0.001064 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.568 on 29 degrees of freedom
## Multiple R-squared: 0.8302, Adjusted R-squared: 0.8185
## F-statistic: 70.91 on 2 and 29 DF, p-value: 6.809e-12
Full Model
# Full model: regress mpg on every other column of mtcars.
full_model_linear_regression <- lm(mpg ~ ., data = mtcars)
summary(full_model_linear_regression)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
Backward Elimination
This method starts with the full model and eliminates unimportant predictor variables from the model.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
# Backward elimination: start from the full model and repeatedly drop
# the predictor whose removal most improves the AIC.
backward_model <- stepAIC(full_model_linear_regression,
                          direction = "backward",
                          trace = FALSE)
summary(backward_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
Stepwise Method
This method starts with the one variable and add or removes predictor variables systematically.
library(MASS)
# Stepwise selection: at each step a predictor may be either added or
# removed, again guided by AIC.
stepwise_model <- stepAIC(full_model_linear_regression,
                          direction = "both",
                          trace = FALSE)
summary(stepwise_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
Decision trees partition the data set using decision boundaries.
library(rpart)
library(rpart.plot)
# Shallow regression tree (maxdepth = 2) predicting mpg from weight and
# cylinders; cp is set very low so that depth is the binding constraint.
ctrl <- list(cp = 0.00001, minbucket = 1, maxdepth = 2)
tree <- rpart(mpg ~ wt + cyl, data = mtcars, control = ctrl, method = 'anova')
rpart.plot(tree)
library(ggplot2)
# Show the tree's decision boundaries in the (cyl, wt) plane; the
# dashed lines correspond to the split points chosen by rpart.
ggplot(mtcars, aes(x = cyl, y = wt, color = mpg)) +
  geom_point() +
  geom_hline(yintercept = 2.3, linetype = "dashed",
             color = "red", linewidth = 1) +
  geom_vline(xintercept = 7, linetype = "dashed",
             color = "purple", linewidth = 1)
R code for Random Forest Regression
library(randomForest)
# Random forest regression of mpg on weight, cylinders, and axle ratio;
# importance = TRUE records permutation importance for the plot below.
Random_Forest_model <- randomForest(mpg ~ wt + cyl + drat,
                                    data = mtcars,
                                    ntree = 500,
                                    importance = TRUE)
important_features <- as.data.frame(importance(Random_Forest_model))
important_features$feature_name <- row.names(important_features)
# Lollipop chart: segment length = %IncMSE, point size = node purity gain.
ggplot(important_features, aes(x = feature_name, y = `%IncMSE`)) +
  geom_segment(aes(x = feature_name, xend = feature_name,
                   y = 0, yend = `%IncMSE`),
               color = "orange") +
  geom_point(aes(size = IncNodePurity), color = "darkorange", alpha = 0.9) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "None") +
  xlab('Features') + ylab('Importance')
We will go through two classifications models in this section.
Heart Disease Data SetIn this section, heart disease (heart_df) data set is
used to apply classification models.
# Build a small toy data set: cholesterol level, age, and a 0/1
# heart-disease outcome for 14 patients.
cholesterol <- c(180, 200, 195, 245, 205, 130, 155, 260, 175, 286, 203, 192, 256, 294)
age <- c(30, 44, 32, 23, 57, 62, 34, 37, 30, 73, 40, 62, 37, 66)
# Numeric 0/1 outcome (the original round-tripped through character).
heart_disease <- c(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1)
heart_df <- data.frame(cholesterol, age, heart_disease)
heart_df
## cholesterol age heart_disease
## 1 180 30 0
## 2 200 44 1
## 3 195 32 1
## 4 245 23 1
## 5 205 57 1
## 6 130 62 0
## 7 155 34 0
## 8 260 37 1
## 9 175 30 0
## 10 286 73 1
## 11 203 40 0
## 12 192 62 1
## 13 256 37 1
## 14 294 66 1
# Scatter of age vs cholesterol, colored by heart-disease status.
# Fixes the misspelled x-axis label ('Cholestoral' -> 'Cholesterol').
heart_plot <- ggplot(data = heart_df,
                     mapping = aes(x = cholesterol, y = age,
                                   col = as.factor(heart_disease))) +
  geom_point() +
  guides(color = guide_legend(title = "Heart Disease")) +
  xlab('Cholesterol') + ylab('Age in Years')
ggplotly(heart_plot)
Display Data:
library(ggplot2)
# Plot the probability of heart disease against cholesterol, overlaying
# a logistic (binomial glm) curve fitted by geom_smooth.
heart_plot <- ggplot(heart_df, aes(x = cholesterol, y = heart_disease)) +
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE) +
  xlab('Cholesterol') + ylab('Heart Disease')
ggplotly(heart_plot)
## `geom_smooth()` using formula = 'y ~ x'
\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol)}{1+exp(\beta_0+\beta_1~Cholesterol)}\]
# Simple logistic regression of heart disease on cholesterol alone.
logistic_regression <- glm(heart_disease ~ cholesterol,
                           data = heart_df,
                           family = 'binomial')
summary(logistic_regression)
##
## Call:
## glm(formula = heart_disease ~ cholesterol, family = "binomial",
## data = heart_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -25.24143 17.94588 -1.407 0.160
## cholesterol 0.13247 0.09289 1.426 0.154
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18.2492 on 13 degrees of freedom
## Residual deviance: 7.2165 on 12 degrees of freedom
## AIC: 11.216
##
## Number of Fisher Scoring iterations: 8
\[ Logit\{Y\}=\beta_0+\beta_1~Cholesterol + \beta_2 Age\] \[ Y= \frac{exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}{1+exp(\beta_0+\beta_1~Cholesterol+ \beta_2 Age)}\]
# Extend the logistic model with age as a second predictor.
logistic_regression <- glm(heart_disease ~ cholesterol + age,
                           data = heart_df,
                           family = 'binomial')
summary(logistic_regression)
##
## Call:
## glm(formula = heart_disease ~ cholesterol + age, family = "binomial",
## data = heart_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -24.31146 14.88515 -1.633 0.102
## cholesterol 0.10649 0.07342 1.450 0.147
## age 0.09857 0.10119 0.974 0.330
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18.2492 on 13 degrees of freedom
## Residual deviance: 6.0047 on 11 degrees of freedom
## AIC: 12.005
##
## Number of Fisher Scoring iterations: 8
Get Odds Ratios from the parameter estimates and interpret them:
# Exponentiate the log-odds coefficients to obtain odds ratios.
exp(coef(logistic_regression))
## (Intercept) cholesterol age
## 2.764816e-11 1.112364e+00 1.103590e+00
The odds of getting a heart disease will increase by 11% (i.e., exp(0.10)=1.11) when the Cholesterol level increases by 1 unit.
The odds of getting heart disease will increase by 10% (i.e., exp(0.09)=1.10) when the age increases by 1 year.
Predict for new set of data with Logistic Regression:
# Predicted probabilities of heart disease for two new patients.
new_patients <- data.frame(cholesterol = c(150, 250), age = c(25, 65))
predict(logistic_regression, newdata = new_patients, type = "response")
## 1 2
## 0.002803361 0.999836308
library(randomForest)
# Random forest classifier; casting the response to a factor makes
# randomForest run in classification (not regression) mode.
Random_Forest_Classifier <- randomForest(as.factor(heart_disease) ~ cholesterol + age,
                                         data = heart_df,
                                         ntree = 500,
                                         importance = TRUE)
important_features <- as.data.frame(importance(Random_Forest_Classifier))
important_features$feature_name <- row.names(important_features)
# Lollipop chart: segment length = mean decrease in Gini impurity,
# point size = mean decrease in accuracy.
ggplot(important_features, aes(x = feature_name, y = `MeanDecreaseGini`)) +
  geom_segment(aes(x = feature_name, xend = feature_name,
                   y = 0, yend = `MeanDecreaseGini`),
               color = "orange") +
  geom_point(aes(size = MeanDecreaseAccuracy), color = "darkorange", alpha = 0.9) +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "none") +
  xlab('Features') + ylab('Importance')
Predict New Data with Random Forest:
# Predicted classes (0/1) for the same two new patients.
predict(Random_Forest_Classifier,
        newdata = data.frame(cholesterol = c(150, 250), age = c(25, 65)))
## 1 2
## 0 1
## Levels: 0 1
Confusion matrix, Sensitivity and Specificity
library(caret)
# Compare out-of-bag random-forest predictions with the observed
# labels; class '1' (disease) is the positive class, so Sensitivity
# and Specificity refer to detecting disease.
random_forest_prediction <- predict(Random_Forest_Classifier)
observed_data <- as.factor(heart_df$heart_disease)
confusionMatrix(random_forest_prediction, observed_data, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3 2
## 1 2 7
##
## Accuracy : 0.7143
## 95% CI : (0.419, 0.9161)
## No Information Rate : 0.6429
## P-Value [Acc > NIR] : 0.4007
##
## Kappa : 0.3778
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7778
## Specificity : 0.6000
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.6000
## Prevalence : 0.6429
## Detection Rate : 0.5000
## Detection Prevalence : 0.6429
## Balanced Accuracy : 0.6889
##
## 'Positive' Class : 1
##
ROC Curve
library(pROC)
# ROC curves for the logistic-regression and random-forest classifiers.
# NOTE: the original passed `smoothed = TRUE`, which is not a
# pROC::roc() argument (the real parameter is `smooth`); it was
# silently swallowed by `...`, so it is removed here without changing
# the output.
roc_logistic <- roc(heart_df$heart_disease,
                    as.numeric(predict(logistic_regression)),
                    plot = TRUE, grid = TRUE, print.auc = TRUE)
roc_random_forest <- roc(heart_df$heart_disease,
                         as.numeric(random_forest_prediction),
                         plot = TRUE, grid = TRUE, print.auc = TRUE)
# Optionally overlay both curves on a single plot:
#plot(roc_logistic, main = "ROC Curve")
#lines(roc_random_forest,col='blue')
Let’s use the mtcars data set to apply a train-test split. The
training set will hold 70% of the data and the test set will hold
30%.
# 70/30 train-test split of mtcars. Each row is independently assigned
# to the training set with probability 0.7, so the realized split sizes
# are random rather than exactly 70/30.
# The selection mask is named in_train: the original called it `sample`,
# which shadowed base::sample.
set.seed(1234)
in_train <- sample(c(TRUE, FALSE), nrow(mtcars), replace = TRUE, prob = c(0.7, 0.3))
mtcars_train <- mtcars[in_train, ]
mtcars_test <- mtcars[!in_train, ]
Train Data Set Size
dim(mtcars_train)
## [1] 26 11
Test Data Set Size
dim(mtcars_test)
## [1] 6 11
Apply k-fold cross validation for mtcars training data
set.
Cross Validation with Linear Regression
library(caret)
# 4-fold cross-validation of a linear regression on the training set.
cross_validate_setup <- trainControl(method = "cv", number = 4)
linear_regression_cv <- train(mpg ~ .,
                              data = mtcars_train,
                              trControl = cross_validate_setup,
                              method = "lm",
                              # RMSE is the metric used to compare models.
                              metric = 'RMSE')
linear_regression_cv
## Linear Regression
##
## 26 samples
## 10 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 19, 19, 20, 20
## Resampling results:
##
## RMSE Rsquared MAE
## 3.765965 0.7585956 2.974284
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Cross Validation with Random Forest Regression
library(caret)
# 4-fold cross-validation of a random forest, grid-searching mtry
# (variables tried per split) over 1..10, scored by RMSE.
cross_validate_setup <- trainControl(method = "cv", number = 4)
random_forest_regression_cv <- train(mpg ~ .,
                                     data = mtcars_train,
                                     trControl = cross_validate_setup,
                                     method = "rf",
                                     metric = 'RMSE',
                                     tuneGrid = expand.grid(.mtry = 1:10),
                                     na.action = na.pass)
random_forest_regression_cv
## Random Forest
##
## 26 samples
## 10 predictors
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 18, 21, 20, 19
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 1 2.685053 0.8574958 2.111840
## 2 2.426804 0.8694192 1.875677
## 3 2.376424 0.8723288 1.751273
## 4 2.366303 0.8671525 1.705216
## 5 2.340886 0.8646725 1.654538
## 6 2.316080 0.8670054 1.647052
## 7 2.386482 0.8635178 1.702981
## 8 2.315735 0.8655307 1.658174
## 9 2.344355 0.8587729 1.674820
## 10 2.283882 0.8637430 1.647277
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.
Random Forest Regression with mtry = 10 is the best
model compared to the Linear Regression as the RMSE is the lowest.
Let’s apply K-Means Clustering to the mtcars data
set
# Partition the 32 cars into two clusters with k-means; the seed makes
# the random initialization reproducible.
set.seed(123)
k_means_clustering <- kmeans(mtcars, centers = 2)
print(k_means_clustering$cluster)
## Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
## 1 1 1 1
## Hornet Sportabout Valiant Duster 360 Merc 240D
## 2 1 2 1
## Merc 230 Merc 280 Merc 280C Merc 450SE
## 1 1 1 2
## Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
## 2 2 2 2
## Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
## 2 1 1 1
## Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
## 1 2 2 2
## Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
## 2 1 1 1
## Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
## 2 1 2 1
Plot K-Means Clustering using the factoextra
library
library(factoextra)
# Visualize the k-means clusters in the space of the first two
# principal components.
fviz_cluster(k_means_clustering, data = mtcars)
The following two methods can be used to check best number of clusters.
# Two heuristics for choosing k: average silhouette width (maximize)
# and total within-cluster sum of squares (elbow method).
fviz_nbclust(mtcars, kmeans, method = "silhouette")
fviz_nbclust(mtcars, kmeans, method = "wss")
Let’s apply Hierarchical Clustering to the
mtcars data set
library(ggdendro)
# Hierarchical clustering of cars on Euclidean distance (hclust uses
# complete linkage by default), drawn as a dendrogram.
set.seed(123)
mtcars_distance <- dist(mtcars, method = 'euclidean')
h_clustering <- hclust(mtcars_distance)
ggdendrogram(h_clustering)
Hierarchical-clustering-based heatmaps are popular in analyzing gene expression data
Data Source: https://bioconductor.org/packages/devel/data/experiment/html/celldex.html
library(BiocManager)
library(GEOquery)
library(pheatmap)
# Download expression series GSE33126 from GEO (requires network access).
gse <- getGEO("GSE33126")
# Expression matrix dimensions: probes x samples.
dim(exprs(gse[[1]]))
# Transpose so samples are rows and probes are columns.
gene_exp <- t(exprs(gse[[1]]))
# Keep only high-total-expression probes so the heatmap stays tractable.
data_subset <- as.matrix(gene_exp[, colSums(gene_exp) > 500000])
# Distances between probes (columns of data_subset).
# Renamed from `dist` — the original assignment shadowed stats::dist.
probe_dist <- dist(t(data_subset), diag = TRUE)
hc <- hclust(probe_dist)
dhc <- as.dendrogram(hc)
#ggdendrogram(hc)
# pheatmap performs its own clustering on the scaled matrix.
pheatmap(scale(data_subset))
Principal components reduce the data dimensions and allow us to visualize the correlation structure of the data effectively
library(FactoMineR)
# PCA of mtcars via FactoMineR; graph = FALSE suppresses the default
# plots so a biplot can be drawn with factoextra instead.
pca_mtcars <- PCA(mtcars, graph = FALSE)
# repel = TRUE nudges overlapping labels apart.
fviz_pca_biplot(pca_mtcars, repel = TRUE)